# Importing required libraries
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # To set the echo option to False
import warnings
warnings.filterwarnings("ignore") # To set the warning option to False
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
sns.set_style('darkgrid')
#from google.colab import drive
#drive.mount('/content/drive/')
#path='/content/drive/My Drive/Colab Notebooks/PML/Final_Project/online_retail_II.csv'
df= pd.read_csv('online_retail_II.csv')
df.head(10)
| Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085.0 | United Kingdom |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085.0 | United Kingdom |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085.0 | United Kingdom |
| 5 | 489434 | 22064 | PINK DOUGHNUT TRINKET POT | 24 | 2009-12-01 07:45:00 | 1.65 | 13085.0 | United Kingdom |
| 6 | 489434 | 21871 | SAVE THE PLANET MUG | 24 | 2009-12-01 07:45:00 | 1.25 | 13085.0 | United Kingdom |
| 7 | 489434 | 21523 | FANCY FONT HOME SWEET HOME DOORMAT | 10 | 2009-12-01 07:45:00 | 5.95 | 13085.0 | United Kingdom |
| 8 | 489435 | 22350 | CAT BOWL | 12 | 2009-12-01 07:46:00 | 2.55 | 13085.0 | United Kingdom |
| 9 | 489435 | 22349 | DOG BOWL , CHASING BALL DESIGN | 12 | 2009-12-01 07:46:00 | 3.75 | 13085.0 | United Kingdom |
print("Data size: ", df.shape)
Data size: (1067371, 8)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1067371 entries, 0 to 1067370 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Invoice 1067371 non-null object 1 StockCode 1067371 non-null object 2 Description 1062989 non-null object 3 Quantity 1067371 non-null int64 4 InvoiceDate 1067371 non-null object 5 Price 1067371 non-null float64 6 Customer ID 824364 non-null float64 7 Country 1067371 non-null object dtypes: float64(2), int64(1), object(5) memory usage: 65.1+ MB
Data type change
'Customer ID' is numerical which needs to be changed to 'object'
'InvoiceDate' to be converted to datetime
df['Customer ID']=df['Customer ID'].astype('object')
df['InvoiceDate']=pd.to_datetime(df['InvoiceDate'])
numCols = list(df.select_dtypes(exclude='object').columns)
print(f"There are {len(numCols)} numerical features:\n", numCols)
There are 3 numerical features: ['Quantity', 'InvoiceDate', 'Price']
catCols = list(df.select_dtypes(include='object').columns)
print(f"There are {len(catCols)} Non numerical features:\n", catCols)
There are 5 Non numerical features: ['Invoice', 'StockCode', 'Description', 'Customer ID', 'Country']
# Rename the following columns:
# Invoice to InvoiceNo
# Customer ID to CustomerID
# Price to UnitPrice
df.rename(columns={'Invoice':'InvoiceNo', 'Customer ID':'CustomerID', 'Price':'UnitPrice', 'StockCode':'ProductCode'}, inplace=True)
df.head()
| InvoiceNo | ProductCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085.0 | United Kingdom |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085.0 | United Kingdom |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085.0 | United Kingdom |
df['total_spend'] = df['UnitPrice'] * df['Quantity']
df['refund'] = df['InvoiceNo'].str.contains("^C")
Updated Variable Descriptions:
refund - This variable indicates whether an observation corresponds to a canceled transaction with a refunded amount. Refunded transactions are identifiable by the presence of the letter "C" as the initial character in the invoice number value.
total_spend - The total earnings associated with each individual transaction are represented by this variable.
df.head()
| InvoiceNo | ProductCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total_spend | refund | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085.0 | United Kingdom | 83.4 | False |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom | 81.0 | False |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom | 81.0 | False |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085.0 | United Kingdom | 100.8 | False |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085.0 | United Kingdom | 30.0 | False |
Certain evident errors are present, such as instances of extremely high order quantities that were subsequently canceled. Additionally, there are empty transactions characterized by both a unit price of zero and a missing customer ID value. Given their negligible value, these observations will be excluded.
fig = plt.figure(figsize=(10, 7))
plt.boxplot(df['total_spend'])
plt.show();
df['total_spend'].describe()
count 1.067371e+06 mean 1.806987e+01 std 2.924202e+02 min -1.684696e+05 25% 3.750000e+00 50% 9.900000e+00 75% 1.770000e+01 max 1.684696e+05 Name: total_spend, dtype: float64
df[df['total_spend']<0]['refund'].value_counts()
True 19493 False 5 Name: refund, dtype: int64
df = df[df['total_spend']>0]
fig = plt.figure(figsize=(10, 7))
plt.boxplot(df['total_spend'])
plt.show();
df['total_spend'].describe(percentiles=[.1,.2,.3,.4,.5,.6,.7,.8,.9,.99])
count 1.041671e+06 mean 2.013397e+01 std 2.031167e+02 min 1.000000e-03 10% 1.700000e+00 20% 3.290000e+00 30% 4.950000e+00 40% 6.800000e+00 50% 9.960000e+00 60% 1.320000e+01 70% 1.635000e+01 80% 1.980000e+01 90% 3.300000e+01 99% 1.836000e+02 max 1.684696e+05 Name: total_spend, dtype: float64
df = df[df['total_spend']<=df['total_spend'].describe(percentiles=[.99]).values[0]]
df
| InvoiceNo | ProductCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total_spend | refund | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085.0 | United Kingdom | 83.40 | False |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom | 81.00 | False |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom | 81.00 | False |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085.0 | United Kingdom | 100.80 | False |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085.0 | United Kingdom | 30.00 | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1067366 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 2011-12-09 12:50:00 | 2.10 | 12680.0 | France | 12.60 | False |
| 1067367 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 2011-12-09 12:50:00 | 4.15 | 12680.0 | France | 16.60 | False |
| 1067368 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 2011-12-09 12:50:00 | 4.15 | 12680.0 | France | 16.60 | False |
| 1067369 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 2011-12-09 12:50:00 | 4.95 | 12680.0 | France | 14.85 | False |
| 1067370 | 581587 | POST | POSTAGE | 1 | 2011-12-09 12:50:00 | 18.00 | 12680.0 | France | 18.00 | False |
1041671 rows × 10 columns
df.drop(columns ='refund',inplace=True)
# # Remove obvious transaction errors
# transaction_errors = df[(df['total_spend'] < -5000) | (df['total_spend'] > 5000)]['total_spend']
# df = df[~df['total_spend'].isin(transaction_errors)]
# # Remove 'empty' transactions: Total spend equals 0.00
# df = df[df['total_spend'] != 0.00]
By employing the Product Code and Description variables, we can identify rows containing non-transactional data, such as samples or other non-sales entries.
#checking number of unique Product Codes
df['ProductCode'].nunique()
4917
# Non-transactional stock code & description values
for x in range(1, 10):
j = df[df['ProductCode'].str.len() == x]
code_counts = j['ProductCode'].value_counts().head()
print("ProductCode with character length :",x)
code_counts
ProductCode with character length : 1
M 877 m 5 D 5 S 3 B 1 Name: ProductCode, dtype: int64
ProductCode with character length : 2
C2 272 Name: ProductCode, dtype: int64
ProductCode with character length : 3
DOT 1436 Name: ProductCode, dtype: int64
ProductCode with character length : 4
POST 1886 PADS 17 Name: ProductCode, dtype: int64
ProductCode with character length : 5
22423 4061 21212 3290 20725 3179 84879 2938 47566 2740 Name: ProductCode, dtype: int64
ProductCode with character length : 6
85123A 5686 85099B 4124 82494L 2063 85099C 1926 85099F 1885 Name: ProductCode, dtype: int64
ProductCode with character length : 7
15056BL 910 79323LP 170 15056bl 93 79323GR 83 TEST001 9 Name: ProductCode, dtype: int64
ProductCode with character length : 8
DCGS0058 30 DCGSSBOY 21 DCGS0076 15 DCGS0003 13 DCGS0069 5 Name: ProductCode, dtype: int64
ProductCode with character length : 9
DCGSSGIRL 23 AMAZONFEE 4 DCGS0066N 2 Name: ProductCode, dtype: int64
# Non-transactional Product code & description values
for x in range(1, 10):
j = df[df['ProductCode'].str.len() == x]
description_counts = j['Description'].value_counts().head(10)
print("ProductCode with character length :",x)
description_counts
ProductCode with character length : 1
Manual 882 Discount 5 SAMPLES 3 Adjust bad debt 1 Name: Description, dtype: int64
ProductCode with character length : 2
CARRIAGE 272 Name: Description, dtype: int64
ProductCode with character length : 3
DOTCOM POSTAGE 1436 Name: Description, dtype: int64
ProductCode with character length : 4
POSTAGE 1886 PADS TO MATCH ALL CUSHIONS 17 Name: Description, dtype: int64
ProductCode with character length : 5
REGENCY CAKESTAND 3 TIER 4061 ASSORTED COLOUR BIRD ORNAMENT 2938 PARTY BUNTING 2740 LUNCH BAG BLACK SKULL. 2484 STRAWBERRY CERAMIC TRINKET BOX 2427 JUMBO STORAGE BAG SUKI 2398 HEART OF WICKER SMALL 2293 JUMBO SHOPPER VINTAGE RED PAISLEY 2274 60 TEATIME FAIRY CAKE CASES 2257 PAPER CHAIN KIT 50'S CHRISTMAS 2198 Name: Description, dtype: int64
ProductCode with character length : 6
WHITE HANGING HEART T-LIGHT HOLDER 5778 JUMBO BAG RED RETROSPOT 3391 WOODEN FRAME ANTIQUE WHITE 2151 JUMBO BAG BAROQUE BLACK WHITE 1936 JUMBO BAG STRAWBERRY 1912 HANGING HEART ZINC T-LIGHT HOLDER 1566 RED WOOLLY HOTTIE WHITE HEART. 1265 SINGLE HEART ZINC T-LIGHT HOLDER 1186 EDWARDIAN PARASOL NATURAL 1177 KNITTED UNION FLAG HOT WATER BOTTLE 1113 Name: Description, dtype: int64
ProductCode with character length : 7
EDWARDIAN PARASOL BLACK 1003 LIGHT PINK CHERRY LIGHTS 170 GREEN CHERRY LIGHTS 83 This is a test product. 10 Adjustment by Peter on Jun 25 2010 3 SET/3 FLORAL GARDEN TOOLS IN BAG 1 Name: Description, dtype: int64
ProductCode with character length : 8
MISO PRETTY GUM 30 BOYS PARTY BAG 21 SUNJAR LED NIGHT NIGHT LIGHT 15 BOXED GLASS ASHTRAY 13 OOH LA LA DOGS COLLAR 5 HAYNES CAMPER SHOULDER BAG 4 CAT CAMOUFLAGUE COLLAR 3 DOGS NIGHT COLLAR 2 CAMOUFLAGE DOG COLLAR 2 HANDZ-OFF CAR FRESHENER 1 Name: Description, dtype: int64
ProductCode with character length : 9
GIRLS PARTY BAG 23 AMAZON FEE 4 NAVY CUDDLES DOG HOODIE 2 Name: Description, dtype: int64
Counting the most common product codes according to the number of characters in it. Product codes with values of D,M,S represent discounts, manual input and samples respectively.
Removing incorrect Product Codes.
# Amazon Fee
amazon_fee = df[df['ProductCode'] == 'AMAZONFEE']
amazon_fee_total = round(amazon_fee['total_spend'].sum())
formatted_amazon_fee_total = '{:,}'.format(amazon_fee_total)
print(f"Total Amazon Fee cost\nOutgoing expense labeled ‘AMAZONFEE’ totals £{formatted_amazon_fee_total}")
# Postage Fee
postage_fee = df[df['ProductCode'] == 'POST']
postage_fee_total = round(postage_fee['total_spend'].sum())
formatted_postage_fee_total = '{:,}'.format(postage_fee_total)
print(f"Revenue labeled ‘POST’ recorded in primarily non UK transactions totals £{formatted_postage_fee_total}")
Total Amazon Fee cost Outgoing expense labeled ‘AMAZONFEE’ totals £34,009 Revenue labeled ‘POST’ recorded in primarily non UK transactions totals £127,597
# Remove chosen values
df = df[~df['Description'].isin(['Sample', 'Adjust bad debt','POSTAGE','This is a test product',\
'Adjustment by Peter on Jun 25 2010','AMAZON FEE'])]
df.isnull().sum()
InvoiceNo 0 ProductCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 236069 Country 0 total_spend 0 dtype: int64
def missing(dff):
print (round((dff.isnull().sum() * 100/ len(dff)),2).sort_values(ascending=False))
missing(df)
CustomerID 22.7 InvoiceNo 0.0 ProductCode 0.0 Description 0.0 Quantity 0.0 InvoiceDate 0.0 UnitPrice 0.0 Country 0.0 total_spend 0.0 dtype: float64
As anticipated, there are 236069 missing entries for the CustomerID column. The absent data in the CustomerID column accounts for 22.7% of the total online customers.
Missing values appear only in two columns: CustomerID and Description of product. It would be meaningless to fill NAs of customerid and description features with an aggreagated value as they are not meaningful measures for our purpose. But because customer ids are important for segmentation, this problem should be handled.
We prefer dropping missing values altogether since we still end up having a large dataset for clustering.
df_data = df.dropna()
df_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 803711 entries, 0 to 1067369 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 803711 non-null object 1 ProductCode 803711 non-null object 2 Description 803711 non-null object 3 Quantity 803711 non-null int64 4 InvoiceDate 803711 non-null datetime64[ns] 5 UnitPrice 803711 non-null float64 6 CustomerID 803711 non-null object 7 Country 803711 non-null object 8 total_spend 803711 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(1), object(5) memory usage: 61.3+ MB
df_data.describe() #Some quantity entries are lower than 0, this should be handled first hand
| Quantity | UnitPrice | total_spend | |
|---|---|---|---|
| count | 803711.000000 | 803711.000000 | 803711.000000 |
| mean | 13.314281 | 3.145862 | 21.919404 |
| std | 143.797332 | 27.700719 | 224.086301 |
| min | 1.000000 | 0.001000 | 0.001000 |
| 25% | 2.000000 | 1.250000 | 4.950000 |
| 50% | 5.000000 | 1.950000 | 11.800000 |
| 75% | 12.000000 | 3.750000 | 19.500000 |
| max | 80995.000000 | 10953.500000 | 168469.600000 |
df_data.dtypes
InvoiceNo object ProductCode object Description object Quantity int64 InvoiceDate datetime64[ns] UnitPrice float64 CustomerID object Country object total_spend float64 dtype: object
# there remained no missing data
def missing(dff):
print (round((dff.isnull().sum() * 100/ len(dff)),2).sort_values(ascending=False))
missing(df_data)
InvoiceNo 0.0 ProductCode 0.0 Description 0.0 Quantity 0.0 InvoiceDate 0.0 UnitPrice 0.0 CustomerID 0.0 Country 0.0 total_spend 0.0 dtype: float64
df_data['CustomerID'].isnull().value_counts()
False 803711 Name: CustomerID, dtype: int64
len(df_data)
df_data["ProductCode"].nunique()
803711
4630
# from datetime import datetime, timedelta
# # Create a copy of the cleaned DataFrame
# synthetic_df = pd.DataFrame(columns=['CustomerID','DOB','Age','MembershipFlag'])
# # Generate synthetic Date of Birth (DOB) for unique customers
# unique_customers = df_data['CustomerID'].unique()
# synthetic_df['CustomerID'] = unique_customers
# def generate_dob():
# start_date = datetime(1985, 1, 1)
# end_date = datetime(2005, 12, 31)
# dob = np.random.randint(start_date.timestamp(), end_date.timestamp())
# return pd.to_datetime(dob, unit='s')
# dob_mapping = {customer: generate_dob() for customer in unique_customers}
# synthetic_df['DOB'] = synthetic_df['CustomerID'].map(dob_mapping)
# # Calculate Age based on DOB
# current_date = datetime.today()
# synthetic_df['Age'] = (current_date - synthetic_df['DOB']).astype('<m8[Y]')
# # Generate Membership flag
# membership_options = ['member', 'non-member', 'premium']
# synthetic_df['MembershipFlag'] = np.random.choice(membership_options, size=len(synthetic_df))
# # Display the first few rows of the synthetic DataFrame
# synthetic_df.head()
# Export synthetic_df to a CSV file
#synthetic_df.to_csv('synthetic_data.csv', index=False)
synthetic_df= pd.read_csv('synthetic_data.csv')
# Concatenate synthetic_df and df_data into a new DataFrame
#merged_df = pd.concat([synthetic_df, df_data], ignore_index=True)
merged_df = df_data.merge(synthetic_df,on='CustomerID')
# Display information about the merged DataFrame
merged_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 803711 entries, 0 to 803710 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 803711 non-null object 1 ProductCode 803711 non-null object 2 Description 803711 non-null object 3 Quantity 803711 non-null int64 4 InvoiceDate 803711 non-null datetime64[ns] 5 UnitPrice 803711 non-null float64 6 CustomerID 803711 non-null object 7 Country 803711 non-null object 8 total_spend 803711 non-null float64 9 DOB 803711 non-null object 10 Age 803711 non-null float64 11 MembershipFlag 803711 non-null object dtypes: datetime64[ns](1), float64(3), int64(1), object(7) memory usage: 79.7+ MB
Let's start by visualizing the distribution of your numerical features: 'Quantity', 'UnitPrice', and 'CustomerID'.
import matplotlib.pyplot as plt
import seaborn as sns
# Set style
sns.set(style="whitegrid")
# Plot distribution of Quantity
plt.figure(figsize=(10, 6))
sns.histplot(merged_df['Quantity'], bins=50, kde=True)
plt.title('Distribution of Quantity')
plt.xlabel('Quantity')
plt.ylabel('Frequency')
plt.show()
# Plot distribution of UnitPrice
plt.figure(figsize=(10, 6))
sns.histplot(df_data['UnitPrice'], bins=50, kde=True)
plt.title('Distribution of Unit Price')
plt.xlabel('Unit Price')
plt.ylabel('Frequency')
plt.show()
# Distribution of CustomerID is categorical, so let's calculate the counts
customer_count = merged_df['CustomerID'].value_counts()
plt.figure(figsize=(10, 6))
sns.histplot(customer_count, bins=50, kde=True)
plt.title('Distribution of Customer Counts')
plt.xlabel('Number of Transactions')
plt.ylabel('Frequency')
plt.show();
Let's find out the top products and countries based on their frequencies.
# Top 10 Products
top_products = merged_df['ProductCode'].value_counts().head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_products.index, y=top_products.values)
plt.title('Top 10 Products')
plt.xlabel('Product Code')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
# Top 10 Countries
top_countries = merged_df['Country'].value_counts().head(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_countries.index, y=top_countries.values)
plt.title('Top 10 Countries')
plt.xlabel('Country')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show();
Let's analyze the monthly sales trends to see if there are any seasonal patterns.
# Convert 'Quantity' column to numeric
merged_df['Quantity'] = pd.to_numeric(merged_df['Quantity'], errors='coerce')
# Extract year and month from InvoiceDate
merged_df['YearMonth'] = merged_df['InvoiceDate'].dt.to_period('M')
# Group by YearMonth and calculate total sales
monthly_sales = merged_df.groupby('YearMonth')['Quantity'].sum()
# Convert YearMonth to string for plotting
monthly_sales.index = monthly_sales.index.astype(str)
# Create a basic line plot using matplotlib
plt.figure(figsize=(12, 6))
plt.plot(monthly_sales.index, monthly_sales.values)
plt.title('Monthly Sales Trends')
plt.xlabel('Year-Month')
plt.ylabel('Total Sales')
plt.xticks(rotation=45)
plt.show();
plt.hist(merged_df['Age'], bins=20, edgecolor='k')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Distribution of Age')
plt.show()
(array([50959., 36586., 44353., 34304., 36113., 35850., 35703., 29850.,
37612., 35007., 36315., 39402., 42773., 44151., 38189., 45976.,
56140., 31053., 38584., 54791.]),
array([17. , 18.05, 19.1 , 20.15, 21.2 , 22.25, 23.3 , 24.35, 25.4 ,
26.45, 27.5 , 28.55, 29.6 , 30.65, 31.7 , 32.75, 33.8 , 34.85,
35.9 , 36.95, 38. ]),
<BarContainer object of 20 artists>)
Text(0.5, 0, 'Age')
Text(0, 0.5, 'Frequency')
Text(0.5, 1.0, 'Distribution of Age')
sns.boxplot(x='MembershipFlag', y='Age', data=merged_df)
plt.xlabel('Membership Type')
plt.ylabel('Age')
plt.title('Box plot of Age by Membership Type')
plt.show();
# Calculate the number of online customers and their countries of origin
online_customers = merged_df['CustomerID'].nunique()
countries = merged_df.groupby('CustomerID')['Country'].first().value_counts()
print(f"From the dataset, the online retail shop has {online_customers} customers from {len(countries)} different countries.")
From the dataset, the online retail shop has 5877 customers from 41 different countries.
# Create a DataFrame to summarize customer counts by country
ctm_cntry_df = countries.reset_index()
ctm_cntry_df.columns = ['Country', 'CustomerCount']
# Calculate the percentage of customer representation in each country
ctm_cntry_df['Percentage'] = np.round(ctm_cntry_df['CustomerCount'] / online_customers * 100, 2)
# Set a threshold for the percentage margin for categorizing countries as 'Other Countries'
percent_margin = 0.25
# Create a new column 'CountryCategory' and assign country names or 'Other Countries'
ctm_cntry_df['CountryCategory'] = ctm_cntry_df['Country']
ctm_cntry_df.loc[ctm_cntry_df['Percentage'] <= percent_margin, 'CountryCategory'] = 'Other Countries'
import plotly.express as px
# Create a bar chart using plotly
bar_fig = px.bar(ctm_cntry_df,
x="CountryCategory",
y="Percentage",
title="Customer Country Count in Percentage",
template="plotly_dark",
color="CountryCategory",
labels={"CountryCategory": "Country Category", "Percentage": "Percentage (%)"},
height=500
)
bar_fig.update_layout(title_x=0,
xaxis_tickangle=-45, # Rotate x-axis labels for better readability
legend_title="Countries Represented"
)
import pandas as pd
import numpy as np
import plotly.express as px
# Calculate revenue for each month
merged_df['InvoiceYearMonth'] = merged_df['InvoiceDate'].map(lambda date: 100*date.year + date.month)
merged_df['Revenue'] = merged_df['UnitPrice'] * merged_df['Quantity']
# Calculate monthly revenue
ctm_revenue = merged_df.groupby('InvoiceYearMonth')['Revenue'].sum().reset_index()
# Display revenue statistics
print(pd.DataFrame(ctm_revenue['Revenue'].describe()))
# Create a line plot for monthly revenue
line_fig = px.line(ctm_revenue,
x="InvoiceYearMonth",
y="Revenue",
title="Monthly Revenue from Dec. 2009 to Dec. 2011",
template="plotly_dark",
labels={"InvoiceYearMonth": "Invoice Year-Month", "Revenue": "Monthly Revenue"}
)
line_fig.update_layout(title_x=0.5,
showlegend=False,
xaxis={"type": "category"}
)
Revenue count 2.500000e+01 mean 7.046746e+05 std 2.099564e+05 min 4.439713e+05 25% 5.888012e+05 50% 6.355014e+05 75% 8.272310e+05 max 1.164223e+06
# Calculate revenue and percentage revenue by country
cntry_revenue_df = merged_df.groupby('Country')['Revenue'].sum().reset_index().sort_values(by='Revenue', ascending=False)
cntry_revenue_df['Percentage'] = np.round(cntry_revenue_df['Revenue'] / cntry_revenue_df['Revenue'].sum() * 100, 2)
# Set a threshold for the percentage margin for categorizing countries as 'Other Countries'
percent_margin = 0.25
# Create a new column 'CountryCategory' and assign country names or 'Other Countries'
cntry_revenue_df['CountryCategory'] = cntry_revenue_df['Country']
cntry_revenue_df.loc[cntry_revenue_df['Percentage'] <= percent_margin, 'CountryCategory'] = 'Other Countries'
# Create a pie chart for country revenue percentage
pie_fig = px.pie(cntry_revenue_df,
names="CountryCategory",
values="Percentage",
title="Country Revenue in Percentage",
template="plotly_dark",
labels={"CountryCategory": "Country Category", "Percentage": "Percentage (%)"}
)
pie_fig.update_layout(title_x=0,
legend_title="Countries Represented",
legend=dict(orientation="h")
)
The dataframe has been divided into two segments.
The initial sub-dataframe, designated by the Python variable "ctm_bhvr_dt," encompasses customer purchases made between 01-12-2009 and 30-08-2011. This particular dataframe will serve as the basis for analyzing the purchasing behavior of online customers.
The subsequent sub-dataframe, attributed to the Python variable "ctm_next_quarter," will be utilized to scrutinize the purchasing behavior of customers within the subsequent quarter timeframe, i.e., spanning from 01-09-2011 to 30-11-2011.
merged_df.head()
| InvoiceNo | ProductCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | total_spend | DOB | Age | MembershipFlag | YearMonth | InvoiceYearMonth | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085.0 | United Kingdom | 83.4 | 2001-03-06 13:57:31 | 22.0 | member | 2009-12 | 200912 | 83.4 |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom | 81.0 | 2001-03-06 13:57:31 | 22.0 | member | 2009-12 | 200912 | 81.0 |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085.0 | United Kingdom | 81.0 | 2001-03-06 13:57:31 | 22.0 | member | 2009-12 | 200912 | 81.0 |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085.0 | United Kingdom | 100.8 | 2001-03-06 13:57:31 | 22.0 | member | 2009-12 | 200912 | 100.8 |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085.0 | United Kingdom | 30.0 | 2001-03-06 13:57:31 | 22.0 | member | 2009-12 | 200912 | 30.0 |
# Define the date ranges
start_date_ctm_bhvr = pd.to_datetime('2009-12-01')
end_date_ctm_bhvr = pd.to_datetime('2011-08-30')
start_date_ctm_next_quarter = pd.to_datetime('2011-09-01')
end_date_ctm_next_quarter = pd.to_datetime('2011-11-30')
# Filter the data based on date ranges
customer_bhvr_dt = merged_df[(merged_df['InvoiceDate'] >= start_date_ctm_bhvr) & (merged_df['InvoiceDate'] <= end_date_ctm_bhvr)]
customer_next_quarter = merged_df[(merged_df['InvoiceDate'] >= start_date_ctm_next_quarter) & (merged_df['InvoiceDate'] <= end_date_ctm_next_quarter)]
# Get the distinct customers in the dataframe ctm_bhvr_dt
customer_dt = pd.DataFrame(customer_bhvr_dt ['CustomerID'].unique())
# Rename the column to CustomerID.
customer_dt.columns = ['CustomerID']
customer_dt.head()
| CustomerID | |
|---|---|
| 0 | 13085.0 |
| 1 | 13078.0 |
| 2 | 15362.0 |
| 3 | 18102.0 |
| 4 | 12682.0 |
customer_dt = customer_dt.merge(synthetic_df[['CustomerID','Age','MembershipFlag']])
customer_dt
| CustomerID | Age | MembershipFlag | |
|---|---|---|---|
| 0 | 13085.0 | 22.0 | member |
| 1 | 13078.0 | 34.0 | non-member |
| 2 | 15362.0 | 31.0 | non-member |
| 3 | 18102.0 | 31.0 | member |
| 4 | 12682.0 | 32.0 | member |
| ... | ... | ... | ... |
| 5236 | 13644.0 | 22.0 | non-member |
| 5237 | 17451.0 | 27.0 | premium |
| 5238 | 13812.0 | 34.0 | non-member |
| 5239 | 14141.0 | 31.0 | non-member |
| 5240 | 17183.0 | 27.0 | non-member |
5241 rows × 3 columns
customer_dt = pd.concat([customer_dt[['CustomerID','Age']],pd.get_dummies(customer_dt['MembershipFlag'])], axis=1)
customer_dt
| CustomerID | Age | member | non-member | premium | |
|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... |
| 5236 | 13644.0 | 22.0 | 0 | 1 | 0 |
| 5237 | 17451.0 | 27.0 | 0 | 0 | 1 |
| 5238 | 13812.0 | 34.0 | 0 | 1 | 0 |
| 5239 | 14141.0 | 31.0 | 0 | 1 | 0 |
| 5240 | 17183.0 | 27.0 | 0 | 1 | 0 |
5241 rows × 5 columns
Let's find the first purchase made by each customer in the next quarter.
# Group by CustomerID and find the first purchase date
first_purchase_next_quarter = customer_next_quarter.groupby('CustomerID')['InvoiceDate'].min().reset_index()
# Rename the columns for clarity
first_purchase_next_quarter.columns = ['CustomerID', 'FirstPurchaseNextQuarter']
#Converting the CustomerID to Object
first_purchase_next_quarter['CustomerID'] = first_purchase_next_quarter['CustomerID'].astype(int)
first_purchase_next_quarter['CustomerID']=first_purchase_next_quarter['CustomerID'].astype('object')
# Display the first purchase date for each customer in the next quarter
first_purchase_next_quarter.head(10)
| CustomerID | FirstPurchaseNextQuarter | |
|---|---|---|
| 0 | 12347 | 2011-10-31 12:25:00 |
| 1 | 12348 | 2011-09-25 13:13:00 |
| 2 | 12349 | 2011-11-21 09:51:00 |
| 3 | 12352 | 2011-09-20 14:34:00 |
| 4 | 12356 | 2011-11-17 08:40:00 |
| 5 | 12357 | 2011-11-06 16:07:00 |
| 6 | 12359 | 2011-10-13 12:47:00 |
| 7 | 12360 | 2011-10-18 15:22:00 |
| 8 | 12362 | 2011-09-28 12:04:00 |
| 9 | 12364 | 2011-09-22 16:07:00 |
Let's find the last purchase made by each customer in the dataframe customer_bhvr_dt.
# Group by CustomerID and find the last purchase date
last_purchase_dt = customer_bhvr_dt.groupby('CustomerID').InvoiceDate.max().reset_index()
# Rename the columns for clarity
last_purchase_dt.columns = ['CustomerID', 'LastPurchaseDate']
#Converting the CustomerID to Object
last_purchase_dt['CustomerID'] = last_purchase_dt['CustomerID'].astype(int)
last_purchase_dt['CustomerID']=last_purchase_dt['CustomerID'].astype('object')
# Display the last purchase date for each customer
last_purchase_dt.head(10)
| CustomerID | LastPurchaseDate | |
|---|---|---|
| 0 | 12346 | 2011-01-18 10:01:00 |
| 1 | 12347 | 2011-08-02 08:48:00 |
| 2 | 12348 | 2011-04-05 10:47:00 |
| 3 | 12349 | 2010-10-28 08:23:00 |
| 4 | 12350 | 2011-02-02 16:01:00 |
| 5 | 12351 | 2010-11-29 15:23:00 |
| 6 | 12352 | 2011-03-22 16:08:00 |
| 7 | 12353 | 2011-05-19 17:47:00 |
| 8 | 12354 | 2011-04-21 13:11:00 |
| 9 | 12355 | 2011-05-09 13:49:00 |
Let's merge the two dataframes last_purchase_dt and first_purchase_next_quarter
customer_purchase_dates = pd.merge(last_purchase_dt, first_purchase_next_quarter, on='CustomerID', how='left')
# Display the merged DataFrame
customer_purchase_dates.head(10)
| CustomerID | LastPurchaseDate | FirstPurchaseNextQuarter | |
|---|---|---|---|
| 0 | 12346 | 2011-01-18 10:01:00 | NaT |
| 1 | 12347 | 2011-08-02 08:48:00 | 2011-10-31 12:25:00 |
| 2 | 12348 | 2011-04-05 10:47:00 | 2011-09-25 13:13:00 |
| 3 | 12349 | 2010-10-28 08:23:00 | 2011-11-21 09:51:00 |
| 4 | 12350 | 2011-02-02 16:01:00 | NaT |
| 5 | 12351 | 2010-11-29 15:23:00 | NaT |
| 6 | 12352 | 2011-03-22 16:08:00 | 2011-09-20 14:34:00 |
| 7 | 12353 | 2011-05-19 17:47:00 | NaT |
| 8 | 12354 | 2011-04-21 13:11:00 | NaT |
| 9 | 12355 | 2011-05-09 13:49:00 | NaT |
Let's calculate the time difference in days between customer's last purchase in the dataframe last_purchase_dt and the first purchase in the dataframe first_purchase_next_quarter
# Convert date columns to datetime objects
customer_purchase_dates['LastPurchaseDate'] = pd.to_datetime(customer_purchase_dates['LastPurchaseDate'])
customer_purchase_dates['FirstPurchaseNextQuarter'] = pd.to_datetime(customer_purchase_dates['FirstPurchaseNextQuarter'])
# Calculate the time difference in days
customer_purchase_dates['NextPurchaseDay'] = (customer_purchase_dates['FirstPurchaseNextQuarter'] - customer_purchase_dates['LastPurchaseDate']).dt.days
# Display the DataFrame with the time differences
customer_purchase_dates.head(10)
| CustomerID | LastPurchaseDate | FirstPurchaseNextQuarter | NextPurchaseDay | |
|---|---|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:01:00 | NaT | NaN |
| 1 | 12347.0 | 2011-08-02 08:48:00 | 2011-10-31 12:25:00 | 90.0 |
| 2 | 12348.0 | 2011-04-05 10:47:00 | 2011-09-25 13:13:00 | 173.0 |
| 3 | 12349.0 | 2010-10-28 08:23:00 | 2011-11-21 09:51:00 | 389.0 |
| 4 | 12350.0 | 2011-02-02 16:01:00 | NaT | NaN |
| 5 | 12351.0 | 2010-11-29 15:23:00 | NaT | NaN |
| 6 | 12352.0 | 2011-03-22 16:08:00 | 2011-09-20 14:34:00 | 181.0 |
| 7 | 12353.0 | 2011-05-19 17:47:00 | NaT | NaN |
| 8 | 12354.0 | 2011-04-21 13:11:00 | NaT | NaN |
| 9 | 12355.0 | 2011-05-09 13:49:00 | NaT | NaN |
# merge with Customer
customer_dt = pd.merge(customer_dt, customer_purchase_dates[['CustomerID','NextPurchaseDay']], on='CustomerID', how='left')
customer_dt.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | |
|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | NaN |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | NaN |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 |
Update the dataframe customer_dt by filling all missing values with 9999.
customer_dt = customer_dt.fillna(9999)
customer_dt.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | |
|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | 9999.0 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | 9999.0 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 |
Subsequently, our focus will be on defining certain attributes and incorporating them into the "customer_dt" dataframe to establish the foundation for our machine learning model. We intend to employ the Recency-Frequency-Monetary (RFM) segmentation technique, which involves categorizing customers into groups according to the following criteria:
Following this, we will proceed to apply K-means clustering, which will assign customers a score for each of these attributes, facilitating the segmentation process.
To initiate our analysis, we will identify the most recent purchase date for each customer and calculate the number of days of inactivity since their last purchase. Subsequently, we will implement the K-means clustering algorithm to allocate customers a recency score based on these findings.
customer_max_purchase = customer_bhvr_dt.groupby('CustomerID').InvoiceDate.max().reset_index()
customer_max_purchase.columns = ['CustomerID','MaxPurchaseDate']
customer_max_purchase.head()
| CustomerID | MaxPurchaseDate | |
|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:01:00 |
| 1 | 12347.0 | 2011-08-02 08:48:00 |
| 2 | 12348.0 | 2011-04-05 10:47:00 |
| 3 | 12349.0 | 2010-10-28 08:23:00 |
| 4 | 12350.0 | 2011-02-02 16:01:00 |
# Find the recency in days
customer_max_purchase['Recency'] = (customer_max_purchase['MaxPurchaseDate'].max() - customer_max_purchase['MaxPurchaseDate']).dt.days
# Merge the dataframes customer_dt and customer_max_purchase[['CustomerID', 'Recency']] on the CustomerID column.
customer_dt = pd.merge(customer_dt, customer_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
customer_dt.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | Recency | |
|---|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | 9999.0 | 54 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 | 3 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | 9999.0 | 345 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 | 23 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 | 3 |
customer_dt.Recency.describe()
count 5241.000000 mean 201.832856 std 170.868818 min 0.000000 25% 48.000000 50% 158.000000 75% 315.000000 max 635.000000 Name: Recency, dtype: float64
The mean Recency is approximately 201 days whiles the median is 158 days.
# plot histogram
hist_fig = px.histogram(customer_dt,
x="Recency",
title="Customers Recency in Days",
template= "plotly_dark"
)
hist_fig.update_layout(title_x=0.5,
xaxis_title="Recency in groups of 20 days",
yaxis_title="Number of Customers"
)
hist_fig.show(config={'displaylogo': False});
Moving forward, our assessment will involve understanding customers' purchasing behavior by quantifying the frequency with which they engage in transactions at the online retail shop. This will be achieved by calculating the total number of orders made by each individual customer.
#get order counts for each user and create a dataframe with it
customer_frequency = df_data.groupby('CustomerID').InvoiceDate.count().reset_index()
customer_frequency.columns = ['CustomerID','Frequency']
#add this data to our main ctm_dt
customer_dt = pd.merge(customer_dt, customer_frequency, on='CustomerID')
customer_dt.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | Recency | Frequency | |
|---|---|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | 9999.0 | 54 | 84 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 | 3 | 801 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | 9999.0 | 345 | 40 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 | 23 | 1058 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 | 3 | 989 |
customer_dt.Frequency.describe()
count 5241.000000 mean 146.678115 std 365.889770 min 1.000000 25% 23.000000 50% 59.000000 75% 155.000000 max 12890.000000 Name: Frequency, dtype: float64
# plot histogram
hist_fig = px.histogram(x=customer_dt.query('Frequency < 1200')['Frequency'],
title="Customers with Purchase Frequency less than 1200",
template= "plotly_dark"
)
hist_fig.update_layout(title_x=0.5,
xaxis_title="Customer Frequency Purchase in groups of 20",
yaxis_title="Number of Customers"
)
hist_fig.show(config={'displaylogo': False});
customer_revenue = merged_df.groupby('CustomerID').Revenue.sum().reset_index()
#merge it with our customer_dt
customer_dt = pd.merge(customer_dt, customer_revenue, on='CustomerID')
customer_dt.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | Recency | Frequency | Revenue | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | 9999.0 | 54 | 84 | 2433.28 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 | 3 | 801 | 29532.45 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | 9999.0 | 345 | 40 | 613.08 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 | 23 | 1058 | 608821.65 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 | 3 | 989 | 21693.91 |
# plot histogram
hist_fig = px.histogram(x=customer_dt.query('Revenue < 10000')['Revenue'],
title="Customers with Monetary Value below 10000",
template= "plotly_dark"
)
hist_fig.update_layout(title_x=0.5,
xaxis_title="Customers Revenue",
yaxis_title="Number of Customers"
)
hist_fig.show(config={'displaylogo': False});
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# # Normalize RFM values
scaler = StandardScaler()
rfm_values = customer_dt[['Recency', 'Frequency', 'Revenue']]
normalized_rfm = scaler.fit_transform(rfm_values)
# Apply K-means clustering
num_clusters = 5 # You can choose the number of clusters based on your preference
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
customer_dt['Cluster'] = kmeans.fit_predict(normalized_rfm)
# Display the clustered DataFrame
customer_dt.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | Recency | Frequency | Revenue | Cluster | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | 9999.0 | 54 | 84 | 2433.28 | 0 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 | 3 | 801 | 29532.45 | 3 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | 9999.0 | 345 | 40 | 613.08 | 1 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 | 23 | 1058 | 608821.65 | 2 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 | 3 | 989 | 21693.91 | 3 |
customer_dt.groupby('Cluster')['Recency','Frequency','Revenue'].mean()
| Recency | Frequency | Revenue | |
|---|---|---|---|
| Cluster | |||
| 0 | 81.117627 | 142.057946 | 2738.352099 |
| 1 | 378.633796 | 54.959722 | 914.166266 |
| 2 | 12.000000 | 2975.000000 | 483173.180000 |
| 3 | 31.958333 | 1009.197917 | 27883.975901 |
| 4 | 1.000000 | 9481.250000 | 113453.317500 |
To achieve our objective of predicting whether a customer will make a purchase in the upcoming quarter, we will introduce a new column named "NextPurchaseDayRange" into the dataset. The values within this column will be categorized as either 1 or 0, in accordance with the following criteria:
A value of 1 signifies that the customer is expected to make a purchase within the next quarter, calculated from their last purchase date. A value of 0 signifies that the customer is projected to make a purchase more than a certain number of days beyond the last purchase date.
#create ctm_class as a copy of ctm_dt before applying get_dummies
customer_class = customer_dt.copy()
customer_class['CustomerID']=customer_class['CustomerID'].astype('object')
customer_class['NextPurchaseDayRange'] = 1 ## less than 3 months
customer_class.loc[customer_class.NextPurchaseDay>90,'NextPurchaseDayRange'] = 0 # more than 3 months
customer_class.head()
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | Recency | Frequency | Revenue | Cluster | NextPurchaseDayRange | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13085.0 | 22.0 | 1 | 0 | 0 | 9999.0 | 54 | 84 | 2433.28 | 0 | 0 |
| 1 | 13078.0 | 34.0 | 0 | 1 | 0 | 20.0 | 3 | 801 | 29532.45 | 3 | 1 |
| 2 | 15362.0 | 31.0 | 0 | 1 | 0 | 9999.0 | 345 | 40 | 613.08 | 1 | 0 |
| 3 | 18102.0 | 31.0 | 1 | 0 | 0 | 27.0 | 23 | 1058 | 608821.65 | 2 | 1 |
| 4 | 12682.0 | 32.0 | 1 | 0 | 0 | 21.0 | 3 | 989 | 21693.91 | 3 | 1 |
Finally in this section, let's see the correlation between our features and label. We can achieve this by applying the corr method to the dataframe customer_dt.
corr_matrix = customer_class[customer_class.columns].corr()
corr_df = pd.DataFrame(corr_matrix.min())
corr_df.columns = ['MinCorrelationCoeff']
corr_df['MaxCorrelationCoeff'] = corr_matrix[corr_matrix < 1].max()
corr_df
| MinCorrelationCoeff | MaxCorrelationCoeff | |
|---|---|---|
| Age | -0.011961 | 0.027553 |
| member | -0.512112 | 0.024354 |
| non-member | -0.512112 | 0.035098 |
| premium | -0.496009 | 0.001042 |
| NextPurchaseDay | -0.552031 | 0.438988 |
| Recency | -0.489989 | 0.451111 |
| Frequency | -0.270456 | 0.466935 |
| Revenue | -0.155728 | 0.466935 |
| Cluster | -0.018193 | 0.451111 |
| NextPurchaseDayRange | -0.552031 | 0.324937 |
plt.figure(figsize = (20, 15))
sns.heatmap(corr_matrix, annot = True, linewidths=0.2, fmt=".2f");
from sklearn.model_selection import train_test_split
data = customer_class#.set_index('CustomerID')
# customer_class = customer_class.drop('NextPurchaseDay', axis=1)
sns.countplot(x='NextPurchaseDayRange', data=data)
<Axes: xlabel='NextPurchaseDayRange', ylabel='count'>
data_corr = pd.DataFrame(data.corr())
data_corr = pd.DataFrame(data_corr['NextPurchaseDayRange'])
# # Remove specific indices, all 3 are categorical
# indices_to_remove = ['Liability-Assets Flag', 'Net Income Flag','Bankrupt']
data_corr = data_corr.drop('NextPurchaseDayRange')
plt.figure(figsize=(5, 10))
sns.barplot(y=data_corr.index,x=data_corr['NextPurchaseDayRange'])
plt.title("Pearson correllation with NextPurchaseDayRange")
plt.show()
<Figure size 500x1000 with 0 Axes>
<Axes: xlabel='NextPurchaseDayRange'>
Text(0.5, 1.0, 'Pearson correllation with NextPurchaseDayRange')
# Lets see what features has weak correlation to strong correlation (>|0.10|)
temp_corr = data_corr
temp_corr[['NextPurchaseDayRange']] = abs(temp_corr[['NextPurchaseDayRange']])
print("\nColumns with correlation (>|0.10|) :\n")
for i in temp_corr[(temp_corr["NextPurchaseDayRange"] > 0.10)].index:
print("* "+i+"\t")
Columns with correlation (>|0.10|) : * NextPurchaseDay * Recency * Frequency * Revenue
from sklearn.feature_selection import mutual_info_classif
independent_variable = data.drop(['NextPurchaseDayRange'], axis=1)
target_variable = data[['NextPurchaseDayRange']]
importances = mutual_info_classif(independent_variable,pd.Series.ravel(target_variable))
importances = pd.Series(importances,independent_variable.columns[0:len(independent_variable.columns)])
importances = pd.DataFrame({'features':importances.index, 'importance':importances.values})
# Mutual information
plt.figure(figsize=(10, 17))
sns.barplot(data = importances,y = "features", x = "importance",order=importances.sort_values('importance').features)
plt.xlabel("Mutual Importance")
plt.title("Mutual Importance of Columns")
plt.show()
<Figure size 1000x1700 with 0 Axes>
<Axes: xlabel='importance', ylabel='features'>
Text(0.5, 0, 'Mutual Importance')
Text(0.5, 1.0, 'Mutual Importance of Columns')
drop 'NextPurchaseDay' as 'NextPurchaseDayRange' is derived from it
X = data.drop(columns = ['NextPurchaseDayRange','NextPurchaseDay','CustomerID'])
y = data['NextPurchaseDayRange']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True)
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Create an array of models
models = []
models.append(("LogisticRegression", LogisticRegression()))
models.append(("GaussianNB", GaussianNB()))
models.append(("RandomForestClassifier", RandomForestClassifier()))
models.append(("SVC", SVC()))
models.append(("DecisionTreeClassifier", DecisionTreeClassifier()))
models.append(("GradientBoostingClassifier", GradientBoostingClassifier())),
models.append(("XGBClassifier", XGBClassifier())),
models.append(("KNeighborsClassifier", KNeighborsClassifier()))
# A dictionary for all the distinct models and their respective metrics
model_scores_dict = {'model_name': [],
'accuracy': [],
'f1_score': [],
'recall': [],
'precision': [],
'time': []
}
# For each model name and model in models
for model_name, model in models:
# Add model_name to model_scores_dict
model_scores_dict['model_name'].append(model_name)
kfold = KFold(n_splits=5, random_state=24, shuffle=True)
start = time.time()
# Use cross_val_score to evaluate the model
accuracy_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
f1_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_macro')
recall_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='recall_macro')
precision_scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='precision_macro')
# Calculate the average scores
avg_accuracy = np.average(accuracy_scores)
avg_f1 = np.average(f1_scores)
avg_recall = np.average(recall_scores)
avg_precision = np.average(precision_scores)
# Update model_scores_dict with average scores
model_scores_dict['accuracy'].append(avg_accuracy)
model_scores_dict['f1_score'].append(avg_f1)
model_scores_dict['recall'].append(avg_recall)
model_scores_dict['precision'].append(avg_precision)
model_scores_dict['time'].append((time.time() - start))
# Create a DataFrame from the model_scores_dict
model_score_df = pd.DataFrame(model_scores_dict).set_index("model_name")
# Sort the DataFrame by accuracy, f1_score, and time
model_score_df.sort_values(by=["accuracy", "f1_score", "time"], ascending=False, inplace=True)
# Display the DataFrame
model_score_df.reset_index(inplace=True)
model_score_df
(None,)
(None,)
| model_name | accuracy | f1_score | recall | precision | time | |
|---|---|---|---|---|---|---|
| 0 | GradientBoostingClassifier | 0.911023 | 0.849523 | 0.852642 | 0.847148 | 12.596325 |
| 1 | RandomForestClassifier | 0.909113 | 0.846045 | 0.842525 | 0.852890 | 6.127412 |
| 2 | LogisticRegression | 0.901957 | 0.834688 | 0.837658 | 0.832347 | 0.658375 |
| 3 | XGBClassifier | 0.896230 | 0.823216 | 0.822763 | 0.824563 | 5.999934 |
| 4 | DecisionTreeClassifier | 0.878337 | 0.797477 | 0.800166 | 0.803533 | 0.267923 |
| 5 | GaussianNB | 0.866892 | 0.700595 | 0.661149 | 0.838334 | 0.123314 |
| 6 | KNeighborsClassifier | 0.864740 | 0.770328 | 0.770581 | 0.770119 | 0.931404 |
| 7 | SVC | 0.848521 | 0.662133 | 0.633057 | 0.773080 | 7.767550 |
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
#Gradient Booster Trees Hyperparameter tuning
start = time.time()
# define the hyperparameter grid for GBT
param_grid = {
'n_estimators': [10, 50, 100, 200],
'learning_rate': [0.1, 0.01, 0.001],
'max_depth': [3, 5, 7],
'max_features': ['auto', 'sqrt', 'log2'],
'subsample': [1.0, 0.8, 0.6]
}
# create the classifier
clf = GradientBoostingClassifier(random_state=777)
# create the grid search object
grid_search = RandomizedSearchCV(clf, param_grid, cv=5, scoring='f1',n_iter=10)
# fit the grid search to the data
grid_search.fit(X_train, y_train)
# print the best hyperparameters
print('Best hyperparameters: ', grid_search.best_params_)
# make predictions on the test set
y_pred = grid_search.best_estimator_.predict(X_test)
# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1_hyper = f1_score(y_test, y_pred)
# calculate the ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
# add to results df
# Update model_scores_df with scores
model_score_df = model_score_df.append({
'model_name': 'Gradient Boosting Classifier-HyperParam Tuning',
'accuracy': accuracy,
'f1_score': f1_hyper,
'recall': recall,
'precision':precision,
'time': (time.time() - start),
}, ignore_index=True)
RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=777),
param_distributions={'learning_rate': [0.1, 0.01, 0.001],
'max_depth': [3, 5, 7],
'max_features': ['auto', 'sqrt',
'log2'],
'n_estimators': [10, 50, 100, 200],
'subsample': [1.0, 0.8, 0.6]},
scoring='f1')
Best hyperparameters: {'subsample': 0.6, 'n_estimators': 100, 'max_features': 'sqrt', 'max_depth': 5, 'learning_rate': 0.1}
model_score_df
| model_name | accuracy | f1_score | recall | precision | time | |
|---|---|---|---|---|---|---|
| 0 | GradientBoostingClassifier | 0.911023 | 0.849523 | 0.852642 | 0.847148 | 12.596325 |
| 1 | RandomForestClassifier | 0.909113 | 0.846045 | 0.842525 | 0.852890 | 6.127412 |
| 2 | LogisticRegression | 0.901957 | 0.834688 | 0.837658 | 0.832347 | 0.658375 |
| 3 | XGBClassifier | 0.896230 | 0.823216 | 0.822763 | 0.824563 | 5.999934 |
| 4 | DecisionTreeClassifier | 0.878337 | 0.797477 | 0.800166 | 0.803533 | 0.267923 |
| 5 | GaussianNB | 0.866892 | 0.700595 | 0.661149 | 0.838334 | 0.123314 |
| 6 | KNeighborsClassifier | 0.864740 | 0.770328 | 0.770581 | 0.770119 | 0.931404 |
| 7 | SVC | 0.848521 | 0.662133 | 0.633057 | 0.773080 | 7.767550 |
| 8 | Gradient Boosting Classifier-HyperParam Tuning | 0.906578 | 0.729282 | 0.721311 | 0.737430 | 18.891921 |
Let's see how we could improve the existing model XGB by finding suitable parameters via the process of hyperparameter tuning using GridSearchCV. We will check if the improved XGB Classifier model outperforms the LogisticRegression model.
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import xgboost as xgb
parameter = {
'max_depth':range(3,10,2),
'min_child_weight':range(1,5,2)
}
p_grid_search = GridSearchCV(estimator = xgb.XGBClassifier(eval_metric='mlogloss'),
param_grid = parameter,
scoring='accuracy',
n_jobs=-1,
#iid=False,
cv=2
)
p_grid_search.fit(X_train, y_train)
GridSearchCV(cv=2,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric='mlogloss', feature_types=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate...ne,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, ...),
n_jobs=-1,
param_grid={'max_depth': range(3, 10, 2),
'min_child_weight': range(1, 5, 2)},
scoring='accuracy')
p_grid_search.best_params_, p_grid_search.best_score_
({'max_depth': 3, 'min_child_weight': 3}, 0.9029103053435115)
refined_xgb_model = xgb.XGBClassifier(eval_metric='logloss',
max_depth=list(p_grid_search.best_params_.values())[0]-1,
min_child_weight=list(p_grid_search.best_params_.values())[-1]+4
).fit(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(refined_xgb_model.score(X_train, y_train)))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(refined_xgb_model.score(X_test[X_train.columns], y_test)))
Accuracy of XGB classifier on training set: 0.93 Accuracy of XGB classifier on test set: 0.91
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix
y_score = refined_xgb_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
#display plot
plt.show()
[<matplotlib.lines.Line2D at 0x147f7db6be0>]
Text(0.5, 1.0, 'Precision-Recall Curve')
Text(0, 0.5, 'Precision')
Text(0.5, 0, 'Recall')
As we can see, the improved XGB classifier model is accurate than the LogisticRegression model.
Let us go ahead and predict with these two models.
ref_xgb_pred_y = refined_xgb_model.predict(X_test)
log_reg_pred_y = LogisticRegression().fit(X_train, y_train)
y_score = log_reg_pred_y.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
#display plot
plt.show()
[<matplotlib.lines.Line2D at 0x147d3d00e50>]
Text(0.5, 1.0, 'Precision-Recall Curve')
Text(0, 0.5, 'Precision')
Text(0.5, 0, 'Recall')
Let compute the confusion matrices of these two models with the user-defined function get_confusion_matrix defined below.
def get_confusion_matrix(y_test, y_pred):
"""
Displays the confusion matrix of the input numpy arrays y_test and y_pred.
INPUT:
y_test - A numpy array
y_pred - A numpy array
OUTPUT:
NoneType
"""
data = {'y_Actual': y_test, 'y_Predicted': y_pred}
df = pd.DataFrame(data, columns=['y_Actual','y_Predicted'])
conf_matrix = pd.crosstab(df['y_Actual'], df['y_Predicted'],
rownames=['Actual'],
colnames=['Predicted'])
sns.heatmap(conf_matrix, annot=True, fmt = "d", cmap="Spectral")
plt.show()
Let's get the confusion matrix for y_test and ref_xgb_pred_y, i.e., for the improved XGB classifier model and y_test and log_reg_pred_y, i.e., for the LogisticRegression model.
get_confusion_matrix(np.array(y_test), ref_xgb_pred_y)
log_reg_pred_y=log_reg_pred_y.predict(X_test)
get_confusion_matrix(np.array(y_test), log_reg_pred_y)
Let's check if the refined XGB Classifier outperforms the LogisticRegression for the other metric
# A dictionary of model names with the various metrics
ref_xgb_log_reg_dict = {"model_name" : ["XGBClassifier", "LogisticRegression"],
"Test accuracy" : [accuracy_score(y_test, ref_xgb_pred_y), accuracy_score(y_test, log_reg_pred_y)],
"Test f1_score" : [f1_score(y_test, ref_xgb_pred_y), f1_score(y_test, log_reg_pred_y)],
"Test recall" : [recall_score(y_test, ref_xgb_pred_y), recall_score(y_test, log_reg_pred_y)],
"Test precision" : [precision_score(y_test, ref_xgb_pred_y), precision_score(y_test, log_reg_pred_y)]
}
# Create a dataframe with ref_xgb_log_reg_dict
ref_xgb_log_reg_df = pd.DataFrame(ref_xgb_log_reg_dict).set_index("model_name")
# Order the dataframe ref_xgb_log_reg_df by the metric values in increasing order
ref_xgb_log_reg_df.sort_values(by=["Test accuracy", "Test f1_score", "Test recall", "Test precision"], ascending=False)
| Test accuracy | Test f1_score | Test recall | Test precision | |
|---|---|---|---|---|
| model_name | ||||
| LogisticRegression | 0.908484 | 0.727273 | 0.699454 | 0.757396 |
| XGBClassifier | 0.905624 | 0.724234 | 0.710383 | 0.738636 |
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
# Train a logistic regression model
log_reg_pred_y = LogisticRegression().fit(X_train, y_train)
# Calculate probabilities for positive class
y_score = log_reg_pred_y.predict_proba(X_test)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds_roc = roc_curve(y_test, y_score)
# Calculate AUC
roc_auc = roc_auc_score(y_test, y_score)
# Create ROC curve plot
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label='Test ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Test Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
<Figure size 640x480 with 0 Axes>
[<matplotlib.lines.Line2D at 0x147889718e0>]
[<matplotlib.lines.Line2D at 0x14788971b20>]
(0.0, 1.0)
(0.0, 1.05)
Text(0.5, 0, 'False Positive Rate')
Text(0, 0.5, 'True Positive Rate')
Text(0.5, 1.0, 'Test Receiver Operating Characteristic (ROC)')
<matplotlib.legend.Legend at 0x14788910ee0>
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, confusion_matrix
import numpy as np
# Calculate ROC curve
y_score = refined_xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds_roc = roc_curve(y_test, y_score)
# Calculate AUC
roc_auc = roc_auc_score(y_test, y_score)
# Create ROC curve plot
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label='Test ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Test - Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()
<Figure size 640x480 with 0 Axes>
[<matplotlib.lines.Line2D at 0x14788852400>]
[<matplotlib.lines.Line2D at 0x14788852790>]
(0.0, 1.0)
(0.0, 1.05)
Text(0.5, 0, 'False Positive Rate')
Text(0, 0.5, 'True Positive Rate')
Text(0.5, 1.0, 'Test - Receiver Operating Characteristic (ROC)')
<matplotlib.legend.Legend at 0x14788852880>
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, f1_score,roc_auc_score,accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
# Ignore future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Flatten, Dense
from keras.models import Sequential
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import time
from sklearn.model_selection import KFold
from tensorflow import keras
from keras.layers import Dense
layers = 2
nodes = 10
start_time = time.time()
model = keras.Sequential()
for i in range(layers):
model.add(Dense(nodes, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
kfold = KFold(n_splits=5, random_state=24, shuffle=True)
df = pd.DataFrame(columns = ['Split','Layers-Nodes','Time','Train Accuracy','Train loss','Validation Accuracy','Validation loss'])
split_num = 0
for train_index, val_index in kfold.split(X):
X_train_fold, X_val_fold = X.loc[train_index], X.loc[val_index]
y_train_fold, y_val_fold = y.loc[train_index], y.loc[val_index]
split_num += 1
history = model.fit(X_train_fold, y_train_fold, epochs = 10, validation_data = (X_val_fold, y_val_fold), verbose = 2)
end_time = time.time()
time_str = time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))
train_loss, train_acc = model.evaluate(X_train_fold, y_train_fold)
val_loss, val_acc = model.evaluate(X_val_fold, y_val_fold)
df1 = pd.DataFrame([split_num,str(layers)+"-"+str(nodes),time_str,val_acc,val_loss,train_acc,train_loss]).T
df1.columns = df.columns
df = pd.concat([df,df1],axis=0,ignore_index=True)
df
Epoch 1/10 131/131 - 2s - loss: 23.4184 - accuracy: 0.5742 - val_loss: 3.4464 - val_accuracy: 0.8761 - 2s/epoch - 15ms/step Epoch 2/10 131/131 - 0s - loss: 2.4746 - accuracy: 0.8690 - val_loss: 2.7163 - val_accuracy: 0.8847 - 373ms/epoch - 3ms/step Epoch 3/10 131/131 - 0s - loss: 7.0981 - accuracy: 0.8352 - val_loss: 3.0309 - val_accuracy: 0.8847 - 374ms/epoch - 3ms/step Epoch 4/10 131/131 - 0s - loss: 2.6370 - accuracy: 0.8700 - val_loss: 11.4431 - val_accuracy: 0.8151 - 348ms/epoch - 3ms/step Epoch 5/10 131/131 - 0s - loss: 2.9119 - accuracy: 0.8726 - val_loss: 2.9664 - val_accuracy: 0.8942 - 363ms/epoch - 3ms/step Epoch 6/10 131/131 - 0s - loss: 2.3066 - accuracy: 0.8719 - val_loss: 4.3317 - val_accuracy: 0.8341 - 335ms/epoch - 3ms/step Epoch 7/10 131/131 - 0s - loss: 1.9581 - accuracy: 0.8726 - val_loss: 3.4992 - val_accuracy: 0.8847 - 333ms/epoch - 3ms/step Epoch 8/10 131/131 - 0s - loss: 2.5393 - accuracy: 0.8738 - val_loss: 2.2832 - val_accuracy: 0.8408 - 351ms/epoch - 3ms/step Epoch 9/10 131/131 - 0s - loss: 6.4964 - accuracy: 0.8392 - val_loss: 0.4662 - val_accuracy: 0.8770 - 346ms/epoch - 3ms/step Epoch 10/10 131/131 - 0s - loss: 1.6117 - accuracy: 0.8523 - val_loss: 0.4159 - val_accuracy: 0.8570 - 373ms/epoch - 3ms/step 131/131 [==============================] - 0s 3ms/step - loss: 0.4636 - accuracy: 0.8545 33/33 [==============================] - 0s 3ms/step - loss: 0.4159 - accuracy: 0.8570 Epoch 1/10 132/132 - 1s - loss: 2.1992 - accuracy: 0.8672 - val_loss: 1.2218 - val_accuracy: 0.8903 - 933ms/epoch - 7ms/step Epoch 2/10 132/132 - 0s - loss: 2.6518 - accuracy: 0.8796 - val_loss: 2.2854 - val_accuracy: 0.8865 - 324ms/epoch - 2ms/step Epoch 3/10 132/132 - 0s - loss: 4.5571 - accuracy: 0.8669 - val_loss: 1.2554 - val_accuracy: 0.8893 - 330ms/epoch - 3ms/step Epoch 4/10 132/132 - 0s - loss: 2.8876 - accuracy: 0.8755 - val_loss: 1.1729 - val_accuracy: 0.8903 - 330ms/epoch - 3ms/step Epoch 5/10 132/132 - 0s - loss: 7.2405 - accuracy: 0.8517 - val_loss: 13.6605 - val_accuracy: 0.8197 - 332ms/epoch - 3ms/step Epoch 6/10 132/132 - 0s - loss: 2.5498 - accuracy: 0.8908 - val_loss: 1.7071 - val_accuracy: 0.8483 - 335ms/epoch - 3ms/step Epoch 7/10 132/132 - 0s - loss: 3.8583 - accuracy: 0.8781 - val_loss: 8.0918 - val_accuracy: 0.8149 - 393ms/epoch - 3ms/step Epoch 8/10 132/132 - 0s - loss: 4.1534 - accuracy: 0.8779 - val_loss: 1.1211 - val_accuracy: 0.8817 - 335ms/epoch - 3ms/step Epoch 9/10 132/132 - 0s - loss: 1.9821 - accuracy: 0.8901 - val_loss: 0.6724 - val_accuracy: 0.8931 - 334ms/epoch - 3ms/step Epoch 10/10 132/132 - 0s - loss: 0.6110 - accuracy: 0.8912 - val_loss: 0.5807 - val_accuracy: 0.8893 - 326ms/epoch - 2ms/step 132/132 [==============================] - 0s 2ms/step - loss: 0.7005 - accuracy: 0.9008 33/33 [==============================] - 0s 2ms/step - loss: 0.5807 - accuracy: 0.8893 Epoch 1/10 132/132 - 0s - loss: 3.7604 - accuracy: 0.8650 - val_loss: 1.0156 - val_accuracy: 0.8989 - 390ms/epoch - 3ms/step Epoch 2/10 132/132 - 0s - loss: 1.4335 - accuracy: 0.8798 - val_loss: 0.3975 - val_accuracy: 0.8960 - 336ms/epoch - 3ms/step Epoch 3/10 132/132 - 0s - loss: 6.5310 - accuracy: 0.8433 - val_loss: 16.9364 - val_accuracy: 0.8101 - 330ms/epoch - 2ms/step Epoch 4/10 132/132 - 0s - loss: 4.7215 - accuracy: 0.8777 - val_loss: 0.9438 - val_accuracy: 0.9036 - 335ms/epoch - 3ms/step Epoch 5/10 132/132 - 0s - loss: 2.4926 - accuracy: 0.8920 - val_loss: 0.9300 - val_accuracy: 0.9084 - 329ms/epoch - 2ms/step Epoch 6/10 132/132 - 0s - loss: 1.9289 - accuracy: 0.8936 - val_loss: 0.8596 - val_accuracy: 0.8950 - 329ms/epoch - 2ms/step Epoch 7/10 132/132 - 0s - loss: 7.6746 - accuracy: 0.8579 - val_loss: 1.4856 - val_accuracy: 0.8826 - 331ms/epoch - 3ms/step Epoch 8/10 132/132 - 0s - loss: 2.7283 - accuracy: 0.8843 - val_loss: 2.3386 - val_accuracy: 0.8931 - 352ms/epoch - 3ms/step Epoch 9/10 132/132 - 0s - loss: 1.6662 - accuracy: 0.8951 - val_loss: 0.7977 - val_accuracy: 0.9113 - 345ms/epoch - 3ms/step Epoch 10/10 132/132 - 0s - loss: 4.8245 - accuracy: 0.8777 - val_loss: 1.6967 - val_accuracy: 0.9046 - 348ms/epoch - 3ms/step 132/132 [==============================] - 0s 2ms/step - loss: 2.4105 - accuracy: 0.8974 33/33 [==============================] - 0s 2ms/step - loss: 1.6967 - accuracy: 0.9046 Epoch 1/10 132/132 - 0s - loss: 1.6959 - accuracy: 0.8939 - val_loss: 0.8806 - val_accuracy: 0.8931 - 404ms/epoch - 3ms/step Epoch 2/10 132/132 - 0s - loss: 1.4332 - accuracy: 0.8951 - val_loss: 0.7725 - val_accuracy: 0.8912 - 358ms/epoch - 3ms/step Epoch 3/10 132/132 - 0s - loss: 1.2644 - accuracy: 0.8843 - val_loss: 1.5791 - val_accuracy: 0.8912 - 346ms/epoch - 3ms/step Epoch 4/10 132/132 - 0s - loss: 0.9555 - accuracy: 0.8858 - val_loss: 1.5349 - val_accuracy: 0.8225 - 330ms/epoch - 3ms/step Epoch 5/10 132/132 - 0s - loss: 1.9965 - accuracy: 0.8765 - val_loss: 1.0518 - val_accuracy: 0.9055 - 326ms/epoch - 2ms/step Epoch 6/10 132/132 - 0s - loss: 8.7452 - accuracy: 0.8321 - val_loss: 0.6563 - val_accuracy: 0.9036 - 330ms/epoch - 3ms/step Epoch 7/10 132/132 - 0s - loss: 5.2551 - accuracy: 0.8481 - val_loss: 1.9562 - val_accuracy: 0.8855 - 326ms/epoch - 2ms/step Epoch 8/10 132/132 - 0s - loss: 3.0774 - accuracy: 0.8700 - val_loss: 0.7273 - val_accuracy: 0.8836 - 324ms/epoch - 2ms/step Epoch 9/10 132/132 - 0s - loss: 0.8941 - accuracy: 0.8886 - val_loss: 0.6131 - val_accuracy: 0.8950 - 324ms/epoch - 2ms/step Epoch 10/10 132/132 - 0s - loss: 1.3208 - accuracy: 0.8898 - val_loss: 0.5698 - val_accuracy: 0.9036 - 327ms/epoch - 2ms/step 132/132 [==============================] - 0s 2ms/step - loss: 0.5778 - accuracy: 0.8970 33/33 [==============================] - 0s 3ms/step - loss: 0.5698 - accuracy: 0.9036 Epoch 1/10 132/132 - 1s - loss: 0.6725 - accuracy: 0.8974 - val_loss: 1.3273 - val_accuracy: 0.9055 - 504ms/epoch - 4ms/step Epoch 2/10 132/132 - 1s - loss: 1.3065 - accuracy: 0.8891 - val_loss: 1.8200 - val_accuracy: 0.9017 - 555ms/epoch - 4ms/step Epoch 3/10 132/132 - 1s - loss: 0.5736 - accuracy: 0.8915 - val_loss: 2.0191 - val_accuracy: 0.8845 - 614ms/epoch - 5ms/step Epoch 4/10 132/132 - 1s - loss: 1.6942 - accuracy: 0.8862 - val_loss: 0.5665 - val_accuracy: 0.8941 - 657ms/epoch - 5ms/step Epoch 5/10 132/132 - 0s - loss: 0.5394 - accuracy: 0.8955 - val_loss: 0.4745 - val_accuracy: 0.8998 - 424ms/epoch - 3ms/step Epoch 6/10 132/132 - 0s - loss: 0.7766 - accuracy: 0.8872 - val_loss: 1.4954 - val_accuracy: 0.8340 - 394ms/epoch - 3ms/step Epoch 7/10 132/132 - 0s - loss: 0.7512 - accuracy: 0.8893 - val_loss: 0.8706 - val_accuracy: 0.9017 - 440ms/epoch - 3ms/step Epoch 8/10 132/132 - 1s - loss: 0.5363 - accuracy: 0.8951 - val_loss: 0.4010 - val_accuracy: 0.8989 - 554ms/epoch - 4ms/step Epoch 9/10 132/132 - 0s - loss: 2.7934 - accuracy: 0.8598 - val_loss: 0.7920 - val_accuracy: 0.8979 - 441ms/epoch - 3ms/step Epoch 10/10 132/132 - 0s - loss: 0.5542 - accuracy: 0.8901 - val_loss: 2.2528 - val_accuracy: 0.8760 - 392ms/epoch - 3ms/step 132/132 [==============================] - 0s 3ms/step - loss: 1.7045 - accuracy: 0.8803 33/33 [==============================] - 0s 3ms/step - loss: 2.2528 - accuracy: 0.8760
| Split | Layers-Nodes | Time | Train Accuracy | Train loss | Validation Accuracy | Validation loss | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 2-10 | 00:00:05 | 0.857007 | 0.415866 | 0.854485 | 0.463632 |
| 1 | 2 | 2-10 | 00:00:10 | 0.889313 | 0.58068 | 0.900787 | 0.700455 |
| 2 | 3 | 2-10 | 00:00:14 | 0.90458 | 1.696702 | 0.897448 | 2.410543 |
| 3 | 4 | 2-10 | 00:00:18 | 0.903626 | 0.569752 | 0.896971 | 0.577795 |
| 4 | 5 | 2-10 | 00:00:24 | 0.875954 | 2.252834 | 0.880277 | 1.704512 |
# Plot the accuracy curves for training and validation
plt.plot(history.history['accuracy'], color='b', label="Training accuracy")
plt.plot(history.history['val_accuracy'], color='r',label="Validation accuracy")
legend = plt.legend(loc='best', shadow=True)
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()
# Plot the loss curve
plt.plot(history.history['loss'], color='b', label="Training")
plt.plot(history.history['val_loss'], color='r',label="Validation")
legend = plt.legend(loc='best', shadow=True)
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.show();
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val_fold, np.argmax(model.predict(X_val_fold),axis = 1))
cm_df = pd.DataFrame(cm,
index = range(0,2),
columns = range(0,2))
#Plotting the confusion matrix
plt.figure(figsize=(5,5))
sns.heatmap(cm_df, annot=True, fmt = 'g')
plt.title('Confusion Matrix')
plt.ylabel('Actal Values')
plt.xlabel('Predicted Values')
plt.show();
33/33 [==============================] - 0s 2ms/step
# Plot the accuracy curves for training and validation
plt.plot(history.history['accuracy'], color='b', label="Training accuracy")
plt.plot(history.history['val_accuracy'], color='r',label="Validation accuracy")
legend = plt.legend(loc='best', shadow=True)
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()
# Plot the loss curve
plt.plot(history.history['loss'], color='b', label="Training")
plt.plot(history.history['val_loss'], color='r',label="Validation")
legend = plt.legend(loc='best', shadow=True)
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.show();
#create ctm_class as a copy of ctm_dt before applying get_dummies
customer_class1 = customer_dt.copy()
customer_class1['CustomerID']=customer_class1['CustomerID'].astype(int).astype('object')
customer_class1['NextPurchaseDayRange'] = 0 ## > than 3 months
customer_class1.loc[(customer_class1.NextPurchaseDay>0)&(customer_class1.NextPurchaseDay<=30),'NextPurchaseDayRange'] = 1 # month 1
customer_class1.loc[(customer_class1.NextPurchaseDay>30)&(customer_class1.NextPurchaseDay<=60),'NextPurchaseDayRange'] = 2 # month 2
customer_class1.loc[(customer_class1.NextPurchaseDay>60)&(customer_class1.NextPurchaseDay<=90),'NextPurchaseDayRange'] = 3 # month 3
customer_class1.head()
X1 = customer_class1.drop(columns = ['NextPurchaseDayRange','NextPurchaseDay','CustomerID'])
y1 = customer_class1['NextPurchaseDayRange']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=None, shuffle=True)
logit1 = LogisticRegression().fit(np.array(X_train1), np.array(y_train1))
| CustomerID | Age | member | non-member | premium | NextPurchaseDay | Recency | Frequency | Revenue | Cluster | NextPurchaseDayRange | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13085 | 22.0 | 1 | 0 | 0 | 9999.0 | 54 | 84 | 2433.28 | 0 | 0 |
| 1 | 13078 | 34.0 | 0 | 1 | 0 | 20.0 | 3 | 801 | 29532.45 | 3 | 1 |
| 2 | 15362 | 31.0 | 0 | 1 | 0 | 9999.0 | 345 | 40 | 613.08 | 1 | 0 |
| 3 | 18102 | 31.0 | 1 | 0 | 0 | 27.0 | 23 | 1058 | 608821.65 | 2 | 1 |
| 4 | 12682 | 32.0 | 1 | 0 | 0 | 21.0 | 3 | 989 | 21693.91 | 3 | 1 |
# Store the Selected Pickle files
import pickle
# Save the log_reg_pred_y object as a pickle file
with open('propensity_model_90days.pkl', 'wb') as f:
pickle.dump(log_reg_pred_y, f)
# Save the refined_xgb_model object as a pickle file
with open('propensity_model_monthly.pkl', 'wb') as f:
pickle.dump(logit1, f)
==============================================================================================================